set more off
clear all

// ----------------------------------------------------------------------------
// 1. Liberalization
// ----------------------------------------------------------------------------

/**
Import liberalization years from Wacziarg and Welch (2008 WBER)
Note: two year-coding errors fixed in LACIR_plots.do
*/
clear all
import delimited "./data/liberalization_years.csv", encoding("UTF-8")

save "libyears.dta", replace

// ----------------------------------------------------------------------------
// 2. Informality
// ----------------------------------------------------------------------------

/**
The data were downloaded from: 
https://www.cedlas.econo.unlp.edu.ar/wp/en/estadisticas/sedlac/
(click on "Employment", tab stru_type)
Note: Some transformations were done in Excel
*/

clear
import excel "./data/stru_type_tidy.xlsx", cellrange(A10:U424) firstrow

// Keep only relevant columns
keep country source year informal

// Join with data that has liberalization years
replace country = "Costa Rica" if country == "Costa Rica "
merge m:1 country using "libyears.dta"
assert _merge == 3
drop _merge

// Sort by country and year
sort country year

// Label the variable
label variable libyear "Liberalization year"

// Drop if informal is zero, meaning missing
drop if informal == 0

/*
There are some observations with suffixes "-I" and "-II".
These are assumed to be semestral observations.
Hence, we group by country and source, and take the average
*/

{
	// Keep only first four characters and convert to integer
	replace year = substr(year, 1, 4)
	destring year, replace

	// Compute the mean of informal group by country, source and year
	bysort country source year: egen informalshare = mean(informal)

	// Drop duplicates
	duplicates drop country year source, force
}


// Sort observations
sort country year source

// Create groups by source to make data manipulation easier
egen sou = group(source)

// See source codes
ssc install groups
groups source sou 

// Save data
save "informality-complete.dta", replace

// Drop problematic observations
{
	drop if sou==20 & countrycode=="ARG" // only Greater BA
	drop if sou==25 & countrycode=="BOL" // only Urban
	drop if sou==18 & countrycode=="ECU" // only EPED-Urban
	drop if sou==4 & countrycode=="PRY" // only Asunción
	drop if missing(sou) & countrycode=="BRA" // Not specified
	drop if inlist(countrycode, "CHL", "COL", "GTM")

	drop if inlist(year, 1986, 1991) & sou==22 & countrycode=="ARG" // Argentina: two huge outliers for particular data source in 1986 and 1991
	drop if sou==27 & countrycode=="BOL" // Bolivia: two low outliers from this data source
	drop if sou==20 & countrycode=="EDU" // Ecuador: two low outliers from this data source
	drop if inrange(year, 1995, 1996) & countrycode=="SLV" 	// El Salvador: two outliers in 1995 and 1996
}

// See duplicates by country and year
duplicates list country year

// label variable informalshare "Small firms employees, unskilled self-employed and zero income (% of total)"

// Drop columns
drop sou informal

// Generate a single observation by country and year
bysort country year: egen informality = mean(informalshare)

label variable informality "Small firms employees, unskilled self-employed and zero income (% of total)"

// See number of duplicate occurrences by country and year
// 0 is unique, 1 is record is duplicate (1st ocurrence), etc.
quietly by country year: gen dup = cond(_N==1, 0, _n)

// Remove duplicates
keep if dup<=1

// Order columns
keep countrycode year informality

// Save filtered data
save "informality.dta", replace


// ----------------------------------------------------------------------------
// 3. Unemployment share
// ----------------------------------------------------------------------------

/**
The data were downloaded from The World Bank: 
https://data.worldbank.org/indicator/SL.UEM.TOTL.ZS
Last updated on: 2/15/2022
*/

clear all
import delimited "./data/SL.UEM.TOTL.ZS_DS2.csv", rowrange(5:271) varnames(5)

// Drop unused columns
drop v66 countryname indicatorname indicatorcode

// Include the label in the varname
foreach v of var v* {
    local lbl : var label `v'
    local lbl = "v" + "`lbl'"
    rename `v' `lbl'
}

// Reshape from wide to long
reshape long v@, i(countrycode) j(year)

// Rename the column to a more descriptive one
rename v unemployment
label variable unemployment "Unemployment, total (% of total labor force)"

// Drop missing observations
drop if missing(unemployment)

// Get liberalization years from previously saved data
merge m:1 countrycode using "libyears.dta"
assert _merge != 2
keep if _merge == 3
drop _merge
label variable libyear "Liberalization year"

// Sort observations
sort country year

// Sanity check
* table countrycode

keep countrycode year unemployment

// Save data
save "unemployment.dta", replace


// ----------------------------------------------------------------------------
// 4. WIID Gini
// ----------------------------------------------------------------------------

/*
   Source: World Development Indicators (WDI)
   Link: https://blogs.worldbank.org/opendata/accessing-world-bank-open-data-stata
*/


*****
***** uncomment the following line to install the wbopendata command
*****
*ssc install wbopendata
wbopendata, language(en – English) indicators(SI.POV.GINI; NE.TRD.GNFS.ZS) long clear

// Rename variables
rename ne_trd_gnfs_zs openness
label variable openness "Trade (% of GDP)"

// Rename the column to a more descriptive one
rename si_pov_gini gini
label variable gini "Gini index (World Bank estimate)"

// Remove unused columns
keep countrycode year gini openness

// Sort observations
sort countrycode year

// Create a source column for consistency
gen source_detailed = "iWDI (imported)"

save "WDI2020.dta", replace


/**
Source: World Income Inequality Database (WIID)
Version: 21 May, 2021
Link: https://www.wider.unu.edu/database/previous-versions-wiid
*/

use "./data/WIID_31MAY2021.dta", replace


// Convert all names to lower case
// rename *, lower

// Rename countrycode
rename c3 countrycode

// Keep used columns
// keep countrycode year source_detailed gini

// Append WDI data
append using "WDI2020.dta", keep(countrycode year gini source_detailed)

// Merge in liberalization years to filter for Latin American countries
merge m:1 countrycode using "libyears.dta"
assert _merge != 2
keep if _merge == 3
drop _merge
label variable libyear "Liberalization year"

// Create sou variable for consistency
gen sou = substr(source_detailed, 1, 3)

// See all sources
groups source_detailed sou

// Rename SEDLAC for consistency
// replace sou = "SED" if sou=="Soc"

// Keep same sources used in Bellon's
drop if !inlist(sou, "Cer", "Fis", "SED") & countrycode=="ARG"
drop if !inlist(sou, "Sze", "SED", "iWD") & countrycode=="BOL"
drop if !inlist(sou, "SED", "iWD") & inlist(countrycode, "BRA", "COL", "URY")
drop if !inlist(sou, "Far", "Chi", "SED", "iWD") & countrycode=="CHL"
drop if !inlist(sou, "Psa", "Sau", "Fie", "Dei", "SED", "iWD") & countrycode=="CRI"
drop if !inlist(sou, "Dei", "iWD") & countrycode=="DOM"
drop if !inlist(sou, "SED", "Dei", "iWD") & countrycode=="ECU"
drop if !inlist(sou, "Jai", "SED", "iWD") & countrycode=="SLV"
drop if !inlist(sou, "Dei", "Psa", "IAD", "iWD") & countrycode=="GTM"
drop if !inlist(sou, "SED", "Dei", "iWD") & countrycode=="HND"
drop if !inlist(sou, "Lux", "SED", "OEC", "iWD") & countrycode=="MEX"
drop if !inlist(sou, "SED", "Dei", "iWD") & countrycode=="PAN"
drop if !inlist(sou, "Psa", "SED", "Sze", "iWD") & countrycode=="PER"
drop if !inlist(sou, "Psa", "SED", "iWD") & countrycode=="PRY"
drop if !inlist(sou, "Rod", "SED") & countrycode=="VEN"
drop if countrycode=="NIC"

// See codebook
labelbook areacovr_detailed

// Other filters
drop if countrycode=="CHL" & source_comments=="CEPAL 1987" & sou=="Far"
drop if countrycode=="CHL" & areacovr_detailed!=303 & sou=="Far"
drop if countrycode=="MEX" & areacovr_detailed!=303 & sou=="Lux"
drop if countrycode=="PER" & inlist(areacovr_detailed, 201, 301) & sou=="Sze"

// Sanity check
groups countrycode sou

// See min and max by year
bys countrycode: egen yearmin = min(year)
bys countrycode: egen yearmax = max(year)
groups countrycode yearmin yearmax

// Drop columns
drop yearm*

// Drop source column and rename it for consistency
drop source
rename source_detailed source

// Save data
sort countrycode year id
save "WIID-2020-cleaned.dta", replace

/** To ensure stability in replication, the following block is commented out
    and the resulting data file WDI-libyear-population.dta, is used directly below

*Population in the liberalization year
*Source: World Development Indicators (WDI)
*Link: https://blogs.worldbank.org/opendata/accessing-world-bank-open-data-stata
*Note: Make sure to install wbopendata using: ssc install wbopendata
*/

wbopendata, language(en – English) indicator(SP.POP.TOTL) long clear

// Rename columns
rename sp_pop_totl population
label variable population "Total population (of liberalization year)"

rename year libyear

// Merge in liberalization years to filter for Latin American countries
merge m:1 countrycode libyear using "libyears.dta"
assert _merge != 2
keep if _merge == 3
drop _merge
label variable libyear "Liberalization year"

// Remove unused columns
keep countrycode libyear population

// Save data
save "WDI-libyear-population.dta", replace

// ----------------------------------------------------------------------------
// 5. All outcomes combined
// ----------------------------------------------------------------------------

use "WIID-2020-cleaned.dta", clear

// Generate a single observation by country and year
bysort countrycode year: egen gini_hat = mean(gini)
label variable gini_hat "Estimated Gini coefficient (simple average)"

// See number of duplicate occurrences by country and year
// 0 is unique, 1 is record is duplicate (1st ocurrence), etc.
quietly by countrycode year: gen dup = cond(_N==1, 0, _n)

// Sort observations
sort countrycode year

// Remove duplicates
keep if dup<=1

// Drop unused columns
keep countrycode year gini_hat

// Drop duplicates
duplicates drop countrycode year, force
	
// Append openness
merge 1:1 countrycode year using "WDI2020.dta"
drop _merge
drop gini source_detailed

// Append informality
merge 1:1 countrycode year using "informality.dta"
drop _merge

// Append unemployment 
merge 1:1 countrycode year using "unemployment.dta"
drop _merge

// Append liberalization years and population
merge m:1 countrycode using "WDI-libyear-population.dta"
drop _merge 

// Reorder columns
order countrycode year libyear population
sort countrycode year

// Sanity check
groups countrycode

// Check if there are duplicates by country and year
duplicates list country year

// Save data
save "inequality-outcomes-2020.dta", replace

